package cn.mengfanding.sman.dispatcher;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.log4j.Logger;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.mengfanding.sman.crawler.Crawler;
import cn.mengfanding.sman.text.TextMatcher;

/**
 * Web excavator: coordinates a multi-threaded crawl.
 *
 * <p>All state is static and shared by several concurrent workers
 * ({@code Job} fetchers, {@code Transform} processors and the
 * {@code Killer} supervisor), so every pool uses a thread-safe
 * collection and every counter is an {@link AtomicInteger}. The plain
 * {@code HashSet}/{@code LinkedList}/{@code int} fields of the original
 * version were unsafe under this concurrency.
 *
 * @author mengfanding
 */
public class Excavator {

	// URLs that have already been fetched (concurrent set).
	private static final Set<String> beenPool =
			Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
	// Fetched pages waiting to be processed by Transform workers.
	private static final Queue<SmanDocument> docPool = new ConcurrentLinkedQueue<SmanDocument>();
	private static final Logger log = Logger.getLogger(Excavator.class);
	// URLs waiting to be visited.
	private static final Queue<String> pool = new ConcurrentLinkedQueue<String>();
	// Upper bound on the number of pages to visit; volatile so workers see updates.
	private static volatile int topN = Integer.MAX_VALUE;

	// Number of Job threads currently inside visit().
	private static final AtomicInteger workingCount = new AtomicInteger(0);

	// Regexes a discovered link must match to be queued (read-mostly list).
	private static final List<String> regx = new CopyOnWriteArrayList<String>();

	/**
	 * Registers a regular expression used to filter discovered links.
	 *
	 * @param _regx regex a candidate URL must match before it is queued
	 */
	public static void addURLRegx(String _regx) {
		regx.add(_regx);
	}

	/**
	 * Checks whether a URL passes the registered regex filters.
	 * An empty filter list accepts every URL.
	 *
	 * @param text candidate URL
	 * @return true if no filters are registered or any filter matches
	 */
	public static boolean checkURL(String text) {
		if (regx.isEmpty())
			return true;
		for (String regEx : regx) {
			if (TextMatcher.matherRegText(regEx, text)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Adds a seed URL to the pending pool.
	 * Malformed URLs are logged (with cause) and dropped.
	 *
	 * @param _url seed URL to start crawling from
	 */
	public static void addSeed(String _url) {
		try {
			URL url = new URL(_url);
			pool.offer(url.toString());
			log.info("add seed success : " + url.toString());
		} catch (MalformedURLException e) {
			log.error("add seed error", e); // keep the cause; the original swallowed it
		}
	}

	/**
	 * @return true while any fetcher is mid-visit or unprocessed pages remain
	 */
	public static boolean isWorking() {
		return workingCount.get() > 0 || !docPool.isEmpty();
	}

	/**
	 * Sets the maximum number of pages to visit; non-positive values are ignored.
	 */
	public static void setTopN(int n) {
		if (n > 0)
			topN = n;
	}

	/** Logs every visited URL plus summary counters. */
	public static void show() {
		for (String string : beenPool) {
			log.info(string);
		}
		log.info("have been visited :" + beenPool.size());
		log.info("get page count :" + docPool.size());
	}

	/** Logs a one-line progress snapshot (debugging aid used by Killer). */
	public static void showTest() {
		log.info(t + " " + workingCount + " " + docPool.size() + " " + beenPool.size() + " " + pool.size());
	}

	/**
	 * Starts the crawl: {@code size} fetcher jobs, one Transform worker and
	 * one Killer supervisor, all on a shared fixed thread pool.
	 *
	 * @param size number of concurrent fetcher jobs
	 */
	public static void start(int size) {
		ExecutorService threadPool = Executors.newFixedThreadPool(size + 4);
		for (int i = 0; i < size; i++) {
			threadPool.execute(new Job());
		}
		threadPool.execute(new Transform());
		threadPool.execute(new Killer()); // Killer ends the JVM once work drains
	}

	// Processing rules keyed by URL regex; the first matching rule wins.
	// TODO validate the regex before accepting it.
	private static final Map<String, Rule> rules = new ConcurrentHashMap<String, Rule>();

	/**
	 * Registers a processing rule for pages whose URL matches {@code reg}.
	 */
	public static void addRules(String reg, Rule rule) {
		Excavator.rules.put(reg, rule);
	}

	// Number of Transform workers currently processing a page.
	private static final AtomicInteger t = new AtomicInteger(0);

	/**
	 * Pulls one fetched page off the document pool and applies the first
	 * rule whose regex matches the page URL. Exceptions thrown by a rule
	 * are logged and do not kill the worker.
	 */
	public static void transform() {
		t.incrementAndGet();
		try {
			SmanDocument sd = docPool.poll();
			if (sd != null && !rules.isEmpty()) {
				for (Map.Entry<String, Rule> entry : rules.entrySet()) {
					if (TextMatcher.matherRegText(entry.getKey(), sd.getUrl())) {
						entry.getValue().rule(sd.getDocument());
						break;
					}
				}
			}
		} catch (Exception e) {
			log.error("transform failed", e); // was printStackTrace(): never swallow silently
		} finally {
			t.decrementAndGet(); // guaranteed even if a rule throws
		}
	}

	/**
	 * Visits one pending URL: fetches the page, stores it for the Transform
	 * workers, marks the URL visited and queues every acceptable outgoing link.
	 * Returns without doing anything when the visit limit is reached, the
	 * pool is empty, or the URL was already visited.
	 */
	public static void visit() {
		if (beenPool.size() >= topN - workingCount.get()) {
			log.debug("reached the upper");
			return;
		}
		String _url = pool.poll();
		if (_url == null) {
			log.debug("empty pool");
			return;
		}
		if (beenPool.contains(_url)) {
			log.debug("have been visited - " + _url);
			return;
		}

		workingCount.incrementAndGet();
		try {
			URI base = new URI(_url);

			Document doc = Crawler.getDocByUrl(_url);
			docPool.offer(new SmanDocument(_url, doc)); // hand the page to Transform workers
			beenPool.add(_url); // mark as visited
			Elements alinks = doc.getElementsByTag("a");
			for (Element link : alinks) {
				String href = link.attr("href");
				if ("#".equals(href))
					continue;
				if (href.startsWith("javas")) // skip javascript: pseudo-links
					continue;
				URI newLink = base.resolve(href); // absolutize relative links
				String linkstr = newLink.toString();
				if (beenPool.contains(linkstr)) {
					log.debug("have been visited ,shuld not pull in pool - " + linkstr);
				} else {
					if (checkURL(linkstr))
						pool.offer(linkstr);
				}
			}

		} catch (UnknownHostException e) {
			log.debug(e.getMessage()); // TODO decide whether to re-queue the URL
		} catch (IllegalArgumentException e) {
			log.debug("url is illegal"); // base.resolve() rejects malformed hrefs
		} catch (Exception e) {
			log.warn(e);
			if (!beenPool.contains(_url)) {
				// pool.offer(_url);
				log.debug("put it back in pool - " + _url);
			}
		} finally {
			workingCount.decrementAndGet(); // always release the in-flight slot
		}
	}

	/** @return number of distinct URLs fetched so far */
	public static int visitedCount() {
		return beenPool.size();
	}
}

/**
 * Job: fetch worker that repeatedly asks {@link Excavator} to visit the
 * next pending URL.
 *
 * <p>Implements {@link Runnable} rather than extending {@link Thread}:
 * instances are only ever submitted to an {@code ExecutorService}
 * (see {@code Excavator.start}), which supplies its own threads, so the
 * Thread subclass machinery was never used.
 *
 * @author mengfanding
 */
class Job implements Runnable {

	@Override
	public void run() {
		// Loops forever; the Killer supervisor terminates the JVM when done.
		while (true) {
			Excavator.visit();
		}
	}
}

/**
 * Killer: supervisor that polls crawl progress and terminates the JVM
 * once no fetcher is mid-visit and the document pool is drained.
 *
 * <p>Implements {@link Runnable} rather than extending {@link Thread}
 * because instances run on an {@code ExecutorService} thread.
 *
 * @author mengfanding
 */
class Killer implements Runnable {

	private final Logger log = Logger.getLogger(Killer.class);

	// Wall-clock start, used to report the total crawl duration.
	private final long start = System.currentTimeMillis();

	@Override
	public void run() {

		while (true) {
			Excavator.showTest();
			if (!Excavator.isWorking()) {
				Excavator.show();
				long end = System.currentTimeMillis();
				log.info("spend time : " + (end - start) / 1000 + "Sec");
				System.exit(0); // hard stop: also tears down the executor's threads
			}
			try {
				Thread.sleep(50); // poll interval
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt(); // restore interrupt status
				return; // stop supervising when interrupted
			}
		}
	}

}

/**
 * Transform: worker that drains fetched pages from the document pool and
 * applies the matching processing rule via {@link Excavator#transform()}.
 *
 * <p>Implements {@link Runnable} rather than extending {@link Thread}
 * because instances run on an {@code ExecutorService} thread.
 */
class Transform implements Runnable {
	@Override
	public void run() {
		while (true) {
			Excavator.transform();
			try {
				Thread.sleep(5); // brief pause so fetchers can refill the pool
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt(); // restore interrupt status
				return; // stop processing when interrupted
			}
		}
	}
}